library(car)
library(mosaic)
library(DT)
library(tidyverse)

Read in the Data

files <- dir()
rdat <- read.csv(grep(".csv", files, value=TRUE), header=TRUE)
as_tibble(rdat)
# A tibble: 500 × 11
         y    x1      x2    x3     x4    x5     x6     x7      x8    x9     x10
     <dbl> <dbl>   <dbl> <int>  <dbl> <int>  <dbl>  <dbl>   <dbl> <dbl>   <dbl>
 1  0.0989 -7.14  -0.494     1  7.84      0 -7.03  -3.62  -4.81    1.58 -3.20  
 2 -1.99    7.28  -1.10      0 -0.984     0  6.93   7.11   1.06    7.50 -6.81  
 3 -0.0511 -7.59  -0.256     1  3.20      1 -2.42  -0.864 -5.31   -1.75  6.85  
 4 -0.714   6.73  -3.57      1  6.23      1 -1.41   0.669  7.10    5.65 -1.58  
 5  0.554  -2.13  -2.77      1  5.35      0  7.38  -5.41   0.0705 -7.32  2.65  
 6  2.00    3.17   0.158     0  3.75      1  0.497  3.09  -7.25   -1.20 -7.60  
 7 -0.275   1.11   1.38      1  2.35      0 -2.58   4.76   6.20   -7.64 -5.56  
 8  0.266  -3.46   1.33      1  5.49      1  2.09   4.38  -4.85   -7.49 -5.63  
 9 -0.199  -7.82   0.993     1 -5.45      0 -6.05  -4.59  -5.01   -6.43 -6.47  
10 15.7    -1.26 -78.6       1 -2.18      0  6.36  -4.74   1.06    7.08 -0.0902
# ℹ 490 more rows

Create first Pairs Plot

pairs(rdat, pch=16, cex=1, panel=panel.smooth, col=rgb(.8,.8,.8))

Start with 1/y transformation

rdat$y <- 1/rdat$y
pairs(rdat, pch=16, cex=1, panel=panel.smooth, col=rgb(.8,.8,.8))

Also transform x2 to 1/x2

rdat$x2 <- 1/rdat$x2
pairs(rdat, pch=16, cex=1, panel=panel.smooth, col=rgb(.8,.8,.8))

Try x9 as 1/x9

rdat$x9 <- 1/rdat$x9
pairs(rdat, pch=16, cex=1, panel=panel.smooth, col=rgb(.8,.8,.8))

Hmm… go back.

rdat <- read.csv(grep(".csv", files, value=TRUE), header=TRUE)
pairs(rdat, pch=16, cex=1, panel=panel.smooth, col=rgb(.8,.8,.8))

Try x2 and x5, and throw in an x2^2

lm1 <- lm(y ~ x2*x5*I(x2^2), data=rdat)
summary(lm1)
## 
## Call:
## lm(formula = y ~ x2 * x5 * I(x2^2), data = rdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.529  -1.191   0.079   0.864  50.937 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -9.591e-01  2.449e-01  -3.916 0.000103 ***
## x2             3.730e-01  2.193e-02  17.005  < 2e-16 ***
## x5             2.177e+00  3.529e-01   6.169 1.43e-09 ***
## I(x2^2)        5.018e-03  2.799e-04  17.928  < 2e-16 ***
## x2:x5         -2.801e-01  4.046e-02  -6.923 1.38e-11 ***
## x2:I(x2^2)    -2.456e-05  1.040e-06 -23.607  < 2e-16 ***
## x5:I(x2^2)    -1.283e-02  8.119e-04 -15.808  < 2e-16 ***
## x2:x5:I(x2^2)  5.817e-06  5.389e-06   1.079 0.280913    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.88 on 492 degrees of freedom
## Multiple R-squared:  0.9858, Adjusted R-squared:  0.9856 
## F-statistic:  4870 on 7 and 492 DF,  p-value: < 2.2e-16
palette(c("skyblue","orange","green","purple","steelblue","red","green3","black"))
pairs(cbind(R=lm1$res, Fit=lm1$fit, rdat), pch=16, cex=1, panel=panel.smooth, col=as.factor(rdat$x5))

Drop cubic with x5

lm2 <- lm(y ~ x2 + I(x2^2) + I(x2^3) + 
            x5 + x5:x2 + x5:I(x2^2), data=rdat)
summary(lm2)
## 
## Call:
## lm(formula = y ~ x2 + I(x2^2) + I(x2^3) + x5 + x5:x2 + x5:I(x2^2), 
##     data = rdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.884  -1.175   0.080   0.899  50.713 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.656e-01  2.449e-01  -3.943 9.20e-05 ***
## x2           3.732e-01  2.194e-02  17.015  < 2e-16 ***
## I(x2^2)      5.072e-03  2.754e-04  18.414  < 2e-16 ***
## I(x2^3)     -2.434e-05  1.021e-06 -23.843  < 2e-16 ***
## x5           2.143e+00  3.515e-01   6.096 2.20e-09 ***
## x2:x5       -2.736e-01  4.002e-02  -6.838 2.39e-11 ***
## I(x2^2):x5  -1.214e-02  4.899e-04 -24.772  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.881 on 493 degrees of freedom
## Multiple R-squared:  0.9857, Adjusted R-squared:  0.9856 
## F-statistic:  5679 on 6 and 493 DF,  p-value: < 2.2e-16

Zoom in

ggplot(rdat, aes(x=x2, y=y, color=interaction(x5))) + 
  geom_point() + 
  geom_line(aes(y=lm2$fit)) + 
  facet_wrap(~interaction(x5))

Add x3?

ggplot(rdat, aes(x=x2, y=y, color=interaction(x5,x3))) + 
  geom_point() + 
  geom_line(aes(y=lm2$fit)) + 
  facet_wrap(~interaction(x5))

lm3 <- lm(y ~ x2 + I(x2^2) + I(x2^3) + 
            x5 + x5:x2 + x5:I(x2^2) + 
            x3 + x3:x2 + 
            x3:x5 + x3:x5:x2, data=rdat)
summary(lm3)
## 
## Call:
## lm(formula = y ~ x2 + I(x2^2) + I(x2^3) + x5 + x5:x2 + x5:I(x2^2) + 
##     x3 + x3:x2 + x3:x5 + x3:x5:x2, data = rdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.648  -0.450  -0.263   0.162  20.193 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.677e+00  2.811e-01  -5.965 4.70e-09 ***
## x2           2.362e-02  2.783e-02   0.849  0.39635    
## I(x2^2)      7.671e-03  2.697e-04  28.448  < 2e-16 ***
## I(x2^3)     -9.185e-06  1.236e-06  -7.430 4.88e-13 ***
## x5           3.590e+00  3.906e-01   9.191  < 2e-16 ***
## x3           1.220e+00  3.842e-01   3.175  0.00159 ** 
## x2:x5       -9.518e-02  4.692e-02  -2.028  0.04306 *  
## I(x2^2):x5  -1.575e-02  5.168e-04 -30.482  < 2e-16 ***
## x2:x3        5.714e-01  3.530e-02  16.189  < 2e-16 ***
## x5:x3       -2.616e+00  5.512e-01  -4.746 2.73e-06 ***
## x2:x5:x3    -2.889e-01  6.336e-02  -4.560 6.49e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.056 on 489 degrees of freedom
## Multiple R-squared:  0.9912, Adjusted R-squared:  0.991 
## F-statistic:  5526 on 10 and 489 DF,  p-value: < 2.2e-16
ggplot(rdat, aes(x=x2, y=y, color=interaction(x5,x3))) + 
  geom_point() + 
  geom_line(aes(y=lm3$fit)) + 
  facet_wrap(~interaction(x5,x3))

Increase terms

lm4 <- lm(y ~ x2 + I(x2^2) + I(x2^3) + 
            x5 + x5:x2 + x5:I(x2^2) + 
            x3 + x3:x2 + x3:I(x2^2) + 
            x3:x5 + x3:x5:x2 + x3:x5:I(x2^2), data=rdat)
summary(lm4)
## 
## Call:
## lm(formula = y ~ x2 + I(x2^2) + I(x2^3) + x5 + x5:x2 + x5:I(x2^2) + 
##     x3 + x3:x2 + x3:I(x2^2) + x3:x5 + x3:x5:x2 + x3:x5:I(x2^2), 
##     data = rdat)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -6.751e-13  1.500e-16  1.950e-15  5.960e-15  1.070e-13 
## 
## Coefficients:
##                 Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)   -2.000e+00  3.340e-15 -5.988e+14  < 2e-16 ***
## x2             1.000e-03  3.315e-16  3.017e+12  < 2e-16 ***
## I(x2^2)        1.000e-02  3.435e-18  2.911e+15  < 2e-16 ***
## I(x2^3)        4.114e-20  1.548e-20  2.657e+00  0.00814 ** 
## x5             4.000e+00  4.640e-15  8.622e+14  < 2e-16 ***
## x3             2.000e+00  4.580e-15  4.367e+14  < 2e-16 ***
## x2:x5         -1.000e-03  5.742e-16 -1.741e+12  < 2e-16 ***
## I(x2^2):x5    -2.000e-02  6.614e-18 -3.024e+15  < 2e-16 ***
## x2:x3         -2.010e-01  6.244e-16 -3.219e+14  < 2e-16 ***
## I(x2^2):x3    -1.000e-02  5.725e-18 -1.747e+15  < 2e-16 ***
## x5:x3         -4.000e+00  6.616e-15 -6.046e+14  < 2e-16 ***
## x2:x5:x3       4.010e-01  8.944e-16  4.484e+14  < 2e-16 ***
## I(x2^2):x5:x3  2.000e-02  1.511e-17  1.324e+15  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.626e-14 on 487 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 3.3e+31 on 12 and 487 DF,  p-value: < 2.2e-16
ggplot(rdat, aes(x=x2, y=y, color=interaction(x5,x3))) + 
  geom_point() + 
  geom_line(aes(y=lm4$fit)) + 
  facet_wrap(~interaction(x5,x3))

Final Guess

lm

final.lm <- lm(y ~ x2 + I(x2^2) + I(x2^3) + 
            x5 + x5:x2 + x5:I(x2^2) + 
            x3 + x3:x2 + x3:I(x2^2) + 
            x3:x5 + x3:x5:x2 + x3:x5:I(x2^2), data=rdat)
summary(final.lm)

Call:
lm(formula = y ~ x2 + I(x2^2) + I(x2^3) + x5 + x5:x2 + x5:I(x2^2) + 
    x3 + x3:x2 + x3:I(x2^2) + x3:x5 + x3:x5:x2 + x3:x5:I(x2^2), 
    data = rdat)

Residuals:
       Min         1Q     Median         3Q        Max 
-6.751e-13  1.500e-16  1.950e-15  5.960e-15  1.070e-13 

Coefficients:
                Estimate Std. Error    t value Pr(>|t|)    
(Intercept)   -2.000e+00  3.340e-15 -5.988e+14  < 2e-16 ***
x2             1.000e-03  3.315e-16  3.017e+12  < 2e-16 ***
I(x2^2)        1.000e-02  3.435e-18  2.911e+15  < 2e-16 ***
I(x2^3)        4.114e-20  1.548e-20  2.657e+00  0.00814 ** 
x5             4.000e+00  4.640e-15  8.622e+14  < 2e-16 ***
x3             2.000e+00  4.580e-15  4.367e+14  < 2e-16 ***
x2:x5         -1.000e-03  5.742e-16 -1.741e+12  < 2e-16 ***
I(x2^2):x5    -2.000e-02  6.614e-18 -3.024e+15  < 2e-16 ***
x2:x3         -2.010e-01  6.244e-16 -3.219e+14  < 2e-16 ***
I(x2^2):x3    -1.000e-02  5.725e-18 -1.747e+15  < 2e-16 ***
x5:x3         -4.000e+00  6.616e-15 -6.046e+14  < 2e-16 ***
x2:x5:x3       4.010e-01  8.944e-16  4.484e+14  < 2e-16 ***
I(x2^2):x5:x3  2.000e-02  1.511e-17  1.324e+15  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.626e-14 on 487 degrees of freedom
Multiple R-squared:      1, Adjusted R-squared:      1 
F-statistic: 3.3e+31 on 12 and 487 DF,  p-value: < 2.2e-16

Math Model

\[ Y_i = \beta_0 + \beta_1 X_{2i} + \beta_2 X_{2i}^2 + \beta_3 X_{2i}^3 + \\ \beta_4 X_{5i} + \beta_5 X_{5i} X_{2i} + \beta_6 X_{5i} X_{2i}^2 + \\ \beta_7 X_{3i} + \beta_8 X_{3i} X_{2i} + \beta_9 X_{3i} X_{2i}^2 + \\ \beta_{10} X_{3i} X_{5i} + \beta_{11} X_{3i} X_{5i} X_{2i} + \beta_{12} X_{3i} X_{5i} X_{2i}^2 + \epsilon_i \]

Plot

plot(y ~ x2, data=rdat, col=interaction(x5,x3))
points(final.lm$fit ~ x2, data=rdat, col=interaction(x5,x3), pch=16, cex=0.5)

b <- coef(final.lm)

drawit <- function(x5=0, x3=0, i=1){
  curve(b[1] + b[2]*x2 + b[3]*x2^2 + b[4]*x2^3 + b[5]*x5 + b[6]*x3 + b[7]*x2*x5 + b[8]*x2^2*x5 + b[9]*x2*x3 + b[10]*x2^2*x3 + b[11]*x5*x3 + b[12]*x2*x5*x3 + b[13]*x2^2*x5*x3, add=TRUE, xname="x2", col=palette()[i])  
}

drawit(0,0,1)
drawit(1,0,2)
drawit(0,1,3)
drawit(1,1,4)

b
##   (Intercept)            x2       I(x2^2)       I(x2^3)            x5 
## -2.000000e+00  1.000000e-03  1.000000e-02  4.114207e-20  4.000000e+00 
##            x3         x2:x5    I(x2^2):x5         x2:x3    I(x2^2):x3 
##  2.000000e+00 -1.000000e-03 -2.000000e-02 -2.010000e-01 -1.000000e-02 
##         x5:x3      x2:x5:x3 I(x2^2):x5:x3 
## -4.000000e+00  4.010000e-01  2.000000e-02
with(rdat, levels(interaction(x5,x3)))
## [1] "0.0" "1.0" "0.1" "1.1"